The main purpose of our project is to predict 6 weeks of daily sales of Rossmann for 1,115 stores located across Germany on Kaggle competition.
The dataset consists of roughly 10e6 rows x 13 columns. For the purpose of the project we added the historical dataset “Store” and merged the two datasets by the variable “Store”. In the next step, variables such as month, year, and week were created. Finally, we got the main dataset with 384895 observations (rows) and 28 variables (columns).
# Packages required by the analysis; install any that are missing, then
# attach all of them quietly.
packages <- c('lmtest','forecast','xts','fBasics','urca','lubridate','tidyr',
              'dplyr','readr','data.table','ggplot2','zoo')
for (pkg in packages) {
  if (pkg %in% rownames(installed.packages())) {
    suppressWarnings(suppressMessages(library(pkg, character.only = TRUE)))
  } else {
    # BUG FIX: install.packages() has no `character.only` argument, and the
    # original re-attached with library(i) (missing character.only = TRUE),
    # which would look for a package literally named "i".
    install.packages(pkg)
    suppressWarnings(suppressMessages(library(pkg, character.only = TRUE)))
  }
}
# Read the Kaggle Rossmann training data and the per-store metadata,
# then join them on the store identifier.
rossmann <- data.table::fread("all/train.csv")
store <- data.table::fread("all/store.csv")
rossmann <- merge(rossmann, store, by = "Store")
# Parse the date column and derive calendar features used downstream.
rossmann$Date <- as.Date(rossmann$Date, "%Y-%m-%d")
rossmann$Month <- month(rossmann$Date)
rossmann$Year <- year(rossmann$Date)
# create data for ARIMA analysis
rossmann1 <- rossmann
# Order rows by store and date so window/rolling operations are valid.
rossmann <- rossmann%>%
arrange(Store,Date)
## Warning: package 'bindrcpp' was built under R version 3.4.4
Descriptions of variables:
Sales: the turnover for any given day (this is what you are predicting) Customers: the number of customers on a given day Open: an indicator for whether the store was open: 0 = closed, 1 = open StateHoliday: indicates a state holiday. Normally all stores, with few exceptions, are closed on state holidays. Note that all schools are closed on public holidays and weekends. a = public holiday, b = Easter holiday, c = Christmas, 0 = None SchoolHoliday: indicates if the (Store, Date) was affected by the closure of public schools StoreType: differentiates between 4 different store models: a, b, c, d Promo: indicates whether a store is running a promo on that day DayOfWeek Date Assortment: describes an assortment level: a = basic, b = extra, c = extended CompetitionDistance: distance in meters to the nearest competitor store CompetitionOpenSince: gives the approximate year and month of the time the nearest competitor was opened Promo2 - Promo2 is a continuing and consecutive promotion for some stores: 0 = store is not participating, 1 = store is participating PromoInterval: describes the consecutive intervals Promo2 is started, naming the months the promotion is started anew. E.g. “Feb,May,Aug,Nov” means each round starts in February, May, August, November of any given year for that store.
# Coerce columns to the types expected by the modelling code:
# numeric for counts/indicators, factors for categorical codes.
numeric_cols <- c("Store", "DayOfWeek", "Sales", "Customers", "Open",
                  "Promo", "SchoolHoliday", "CompetitionDistance",
                  "CompetitionOpenSinceMonth", "CompetitionOpenSinceYear",
                  "Promo2", "Promo2SinceWeek", "Promo2SinceYear",
                  "Month", "Year")
factor_cols <- c("StateHoliday", "StoreType", "Assortment", "PromoInterval")
for (cl in numeric_cols) {
  rossmann[[cl]] <- as.numeric(rossmann[[cl]])
}
for (cl in factor_cols) {
  rossmann[[cl]] <- as.factor(rossmann[[cl]])
}
summary(rossmann)
## Store DayOfWeek Date Sales
## Min. : 1.0 Min. :1.000 Min. :2013-01-01 Min. : 0
## 1st Qu.: 280.0 1st Qu.:2.000 1st Qu.:2013-08-17 1st Qu.: 3727
## Median : 558.0 Median :4.000 Median :2014-04-02 Median : 5744
## Mean : 558.4 Mean :3.998 Mean :2014-04-11 Mean : 5774
## 3rd Qu.: 838.0 3rd Qu.:6.000 3rd Qu.:2014-12-12 3rd Qu.: 7856
## Max. :1115.0 Max. :7.000 Max. :2015-07-31 Max. :41551
##
## Customers Open Promo StateHoliday
## Min. : 0.0 Min. :0.0000 Min. :0.0000 0:986159
## 1st Qu.: 405.0 1st Qu.:1.0000 1st Qu.:0.0000 a: 20260
## Median : 609.0 Median :1.0000 Median :0.0000 b: 6690
## Mean : 633.1 Mean :0.8301 Mean :0.3815 c: 4100
## 3rd Qu.: 837.0 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :7388.0 Max. :1.0000 Max. :1.0000
##
## SchoolHoliday StoreType Assortment CompetitionDistance
## Min. :0.0000 a:551627 a:537445 Min. : 20
## 1st Qu.:0.0000 b: 15830 b: 8294 1st Qu.: 710
## Median :0.0000 c:136840 c:471470 Median : 2330
## Mean :0.1786 d:312912 Mean : 5430
## 3rd Qu.:0.0000 3rd Qu.: 6890
## Max. :1.0000 Max. :75860
## NA's :2642
## CompetitionOpenSinceMonth CompetitionOpenSinceYear Promo2
## Min. : 1.0 Min. :1900 Min. :0.0000
## 1st Qu.: 4.0 1st Qu.:2006 1st Qu.:0.0000
## Median : 8.0 Median :2010 Median :1.0000
## Mean : 7.2 Mean :2009 Mean :0.5006
## 3rd Qu.:10.0 3rd Qu.:2013 3rd Qu.:1.0000
## Max. :12.0 Max. :2015 Max. :1.0000
## NA's :323348 NA's :323348
## Promo2SinceWeek Promo2SinceYear PromoInterval
## Min. : 1.0 Min. :2009 :508031
## 1st Qu.:13.0 1st Qu.:2011 Feb,May,Aug,Nov :118596
## Median :22.0 Median :2012 Jan,Apr,Jul,Oct :293122
## Mean :23.3 Mean :2012 Mar,Jun,Sept,Dec: 97460
## 3rd Qu.:37.0 3rd Qu.:2013
## Max. :50.0 Max. :2015
## NA's :508031 NA's :508031
## Month Year
## Min. : 1.000 Min. :2013
## 1st Qu.: 3.000 1st Qu.:2013
## Median : 6.000 Median :2014
## Mean : 5.847 Mean :2014
## 3rd Qu.: 8.000 3rd Qu.:2014
## Max. :12.000 Max. :2015
##
# Investigate how many observations each store has: summarise the span of
# dates per store, then count stores per (start, end, length) combination.
rossmann %>%
  group_by(Store) %>%
  summarise(
    days       = n(),
    start_date = min(Date),
    end_date   = max(Date)
  ) %>%
  group_by(start_date, end_date, days) %>%
  summarise(no_store = n())
## # A tibble: 3 x 4
## # Groups: start_date, end_date [?]
## start_date end_date days no_store
## <date> <date> <int> <int>
## 1 2013-01-01 2015-07-31 758 180
## 2 2013-01-01 2015-07-31 942 934
## 3 2013-01-02 2015-07-31 941 1
Interestingly, there are two groups of stores with significantly different numbers of observations (758 vs 942 observations). So, we decided to investigate further!
180 stores do not have the data during 6 months end of 2014.
Let's investigate Sunday sales.
# Total Sunday turnover per store, then count how many stores have any
# Sunday sales at all.
rossmann %>%
  filter(DayOfWeek == 7) %>%
  group_by(Store) %>%
  summarise(Sales = sum(Sales)) %>%
  ungroup() %>%
  mutate(Sunday_sales = ifelse(Sales > 0, 1, 0)) %>%
  group_by(Sunday_sales) %>%
  summarise(Store_count = n())
## # A tibble: 2 x 2
## Sunday_sales Store_count
## <dbl> <int>
## 1 0 1082
## 2 1 33
33 stores have sunday sales. We added this attribute to the data set.
180 stores do not have the data during 6 months end of 2014.
Let’s see the number of stores by missing data and no sunday sales group.
#--> 180 stores do not have the data during 6 months end of 2014
# Same Sunday-sales tally as above, repeated before the flag is joined
# onto the main data set.
rossmann %>%
  dplyr::filter(DayOfWeek == 7) %>%
  group_by(Store) %>%
  summarise(Sales = sum(Sales)) %>%
  ungroup() %>%
  mutate(Sunday_sales = ifelse(Sales > 0, 1, 0)) %>%
  group_by(Sunday_sales) %>%
  summarise(Store_count = n())
## # A tibble: 2 x 2
## Sunday_sales Store_count
## <dbl> <int>
## 1 0 1082
## 2 1 33
#--> 33 stores have Sunday sales (see the tally above)
# let's add this attribute to the data set
# Build a per-store Sunday_sales indicator (1 = store ever trades on
# Sunday) and join it back onto the full data set.
rossmann <- rossmann %>%
  dplyr::filter(DayOfWeek == 7) %>%
  group_by(Store) %>%
  summarise(Sales = sum(Sales)) %>%
  arrange(Sales) %>%
  ungroup() %>%
  mutate(Sunday_sales = ifelse(Sales > 0, 1, 0)) %>%
  select(-Sales) %>%
  left_join(rossmann, by = 'Store') %>%
  as.data.frame()
# Visual check: daily sales series for a single example store (13).
rossmann %>%
  filter(Store == 13) %>%
  ggplot(aes(x = Date, y = Sales)) + geom_line()
# Number of stores cross-classified by the missing-data flag and the
# Sunday-sales flag (distinct keeps one row per store, like unique()).
rossmann %>%
  distinct(missing, Sunday_sales, Store) %>%
  group_by(missing, Sunday_sales) %>%
  summarise(store_count = n())
## # A tibble: 4 x 3
## # Groups: missing [?]
## missing Sunday_sales store_count
## <dbl> <dbl> <int>
## 1 0 0 903
## 2 0 1 32
## 3 1 0 179
## 4 1 1 1
Convert second promotion to numerical variables, drop such variables as: PromoInterval,Promo2SinceYear,Promo2SinceWeek, CompetitionOpenSinceYear,CompetitionOpenSinceMonth.
# Convert the second-promotion metadata into numeric features:
# days since Promo2 started, and an indicator for Promo2-active months.
rossmann <- rossmann %>%
  group_by(Store) %>%
  # Approximate the Promo2 start date from (year, week). The crude
  # week -> month mapping ceiling(week / 4) can yield 13 for late weeks,
  # which is not a valid month and made as.Date() return NA; cap at 12.
  mutate(date_since_promo2 = Date - as.Date(
    paste(Promo2SinceYear,
          pmin(ceiling(Promo2SinceWeek / 4), 12),
          '01', sep = '-'),
    format = '%Y-%m-%d')) %>%
  # Stores not participating in Promo2 have NA start info; code them 0.
  mutate(date_since_promo2 = ifelse(is.na(date_since_promo2), 0,
                                    date_since_promo2)) %>%
  # BUG FIX: the raw PromoInterval levels are comma-separated
  # ("Feb,May,Aug,Nov"), not underscore-separated, so the original
  # comparisons never matched and promo2_month was always 0.
  mutate(promo2_month = ifelse(
    date_since_promo2 > 0 &
      ((PromoInterval == 'Feb,May,Aug,Nov' & Month %in% c(2, 5, 8, 11)) |
       (PromoInterval == 'Jan,Apr,Jul,Oct' & Month %in% c(1, 4, 7, 10)) |
       (PromoInterval == 'Mar,Jun,Sept,Dec' & Month %in% c(3, 6, 9, 12))),
    1, 0))
# drop some columns we have used
rossmann <- dplyr::select(rossmann, -c(PromoInterval, Promo2SinceYear, Promo2SinceWeek))
#--date since competitor starts
# Days since the nearest competitor opened, built from the (year, month)
# columns; single-digit months are zero-padded before parsing.
rossmann <- rossmann %>%
  group_by(Store) %>%
  mutate(date_since_comp_op = Date - as.Date(
    paste(CompetitionOpenSinceYear,
          ifelse(CompetitionOpenSinceMonth < 10,
                 paste0(0, CompetitionOpenSinceMonth),
                 CompetitionOpenSinceMonth),
          '01', sep = '-'),
    format = '%Y-%m-%d'))
# drop some columns we have used
rossmann <- dplyr::select(rossmann, -c(CompetitionOpenSinceYear, CompetitionOpenSinceMonth))
Expecting better results, we add new variables — average Sales and average Number of Customers over roughly a quarter, half a year, and the last year.
# Forecast horizon: 6 weeks of daily data (the competition's target span).
window <- 6*7
# Rolling-window lengths in days used for the moving-average features.
# NOTE(review): the names overstate the spans — 120 days is ~4 months,
# not a calendar quarter, and 150 days is ~5 months, not half a year;
# confirm these lengths were chosen deliberately. Also note `quarter`
# masks data.table::quarter()/lubridate::quarter() once attached.
quarter <- 120
half_yr <- 150
last_yr <- 365
# Per-store rolling-mean features of Sales and Customers over ~quarter,
# ~half-year and one-year windows. shift(x, window) lags each series by
# the 6-week forecast horizon so the features only use information
# available at prediction time; rep(NA, k - 1) pads the head so the
# rolling mean lines up with the original column length.
# NOTE(review): prepending the NAs while using align = 'left' is an
# unusual combination (left alignment places each mean at the window's
# START) — confirm the intended alignment; the lagged head also injects
# NAs that propagate through rollmean.
rossmann<-rossmann%>%
group_by(Store)%>%
mutate(qtr_S=c(rep(NA,quarter-1),
rollmean(shift(Sales,window),
k= quarter,
align = 'left')),
halfyr_S=c(rep(NA,half_yr-1),
rollmean(shift(Sales,window),
k= half_yr,
align = 'left')),
yr_S=c(rep(NA,last_yr-1),
rollmean(shift(Sales,window),
k= last_yr,
align = 'left')),
qtr_Cs=c(rep(NA,quarter-1),
rollmean(shift(Customers,window),
k= quarter,
align = 'left')),
halfyr_Cs=c(rep(NA,half_yr-1),
rollmean(shift(Customers,window),
k= half_yr,
align = 'left')),
yr_Cs=c(rep(NA,last_yr-1),
rollmean(shift(Customers,window),
k= last_yr,
align = 'left')))
What’s more, it seemed interesting to check, for example, whether the number of days since midsummer would be significant.
# Days elapsed since mid-summer (June 15) of the observation's year,
# floored at 0 for dates before June 15. ifelse() drops the difftime
# class, so the result is a plain numeric column.
rossmann<-rossmann%>%
group_by(Store, Year)%>%
mutate(DayssinceSm = Date -as.Date(paste0(Year,"-06-15"),
"%Y-%m-%d"))%>%
mutate(DayssinceSm = ifelse(DayssinceSm>0,DayssinceSm,0))
And of course we check for missing values and remove them.
# Remove missings: report the NA count, then keep complete rows only.
sum(is.na(rossmann))
rossmann <- rossmann[complete.cases(rossmann), ]
# Re-coerce column types for modelling (DayOfWeek becomes a factor so lm
# creates per-day dummies).
factor_cols <- c("DayOfWeek", "StoreType", "Assortment")
numeric_cols <- c("Store", "Sunday_sales", "Month", "Year", "Sales",
                  "Customers", "Promo", "date_since_comp_op")
for (cl in factor_cols) {
  rossmann[[cl]] <- as.factor(rossmann[[cl]])
}
for (cl in numeric_cols) {
  rossmann[[cl]] <- as.numeric(rossmann[[cl]])
}
Let’s split the data into train and test data sets and perform exploratory data analysis based on the train dataset.
# Split chronologically: the last `window` (6 weeks) of days form the
# test set, everything up to and including the boundary is training data.
split_date <- max(rossmann$Date) - window
# BUG FIX: the original used `<` / `>` so rows with Date == split_date
# fell into NEITHER set; `<=` keeps the boundary date in training.
train <- rossmann %>%
  group_by(Store) %>%
  filter(Date <= split_date)
test <- rossmann %>%
  group_by(Store) %>%
  filter(Date > split_date)
The distribution of Sales is right-skewed. It means that mean Sales per store is higher than its median.
# Distribution of daily Sales (right-skewed: long tail of high-turnover days).
hist(train$Sales, 100) # Sales
Have a look into the distribution of Customer.
# Distribution of daily Customers per store.
hist(train$Customers, 100)
# Distance to the nearest competitor store, in metres.
hist(train$CompetitionDistance, 100)
Sales is as expected strongly correlated with the number of customers. It looks like the Boxplots of customers overlap a little more than the boxplots of sales. This would mean that the promos are not mainly attracting more customers but make customers spend more. The mean amount spent per customer is about one Euro higher:
There are sometimes promos while the respective store is closed and there are promos 45% of the time:
# Cross-tabulate zero vs positive sales against promo status; nonzero
# counts in the ("Sales = 0", "Promo") cell show promos sometimes run
# while the store is closed.
table(ifelse(train$Sales != 0, "Sales > 0", "Sales = 0"),
ifelse(train$Promo, "Promo", "No promo"))
##
## No promo Promo
## Sales = 0 56003 4898
## Sales > 0 157785 133486
We remove the variables Date, Week, Store, and Customers because they are either identifiers or not available at prediction time.
# Delete columns that are identifiers, helper flags, or unavailable at
# prediction time — from both the train and test sets.
drop_cols <- c("Date", "Store", "Customers", "Week", "missing", "promo2_month")
for (cl in drop_cols) {
  train[[cl]] <- NULL
  test[[cl]] <- NULL
}
Separate objects including a vector of potential predictors by type.
# Predictor groups used throughout the EDA and modelling sections.
categorical_vars <- c("DayOfWeek", "StateHoliday", "StoreType", "Assortment")
continuous_vars <- c("Sunday_sales", "Open", "Promo", "SchoolHoliday",
                     "CompetitionDistance", "Promo2", "Month", "Year",
                     "date_since_promo2", "date_since_comp_op",
                     "qtr_S", "halfyr_S", "yr_S",
                     "qtr_Cs", "halfyr_Cs", "yr_Cs", "DayssinceSm")
# The full predictor list is exactly the union of the two groups; build it
# by concatenation instead of repeating every name (the original
# duplicated both vectors verbatim, risking drift).
variables <- c(categorical_vars, continuous_vars)
# Dependent (target) variable.
depend <- "Sales"
We check the correlation using Pearson method.
library(corrplot)
## corrplot 0.84 loaded
# Pairwise Pearson correlations among the continuous predictors,
# computed over complete rows only, shown as circles and as numbers.
corr <- cor(as.data.frame(train)[continuous_vars],
            use = 'complete.obs', method = 'pearson')
corrplot(corr, method = "circle")
corrplot(corr, method = "number")
As we can observe, some variables are highly correlated (>0.8). We don’t remove them, because for the machine-learning methods used here this is not critical.
To check which variabels are the most important Step-wise Regression is used.
# Lower and upper bounds of the step-wise search space below.
base.mod <- lm(Sales ~ 1 , data= train) # base intercept only model
all.mod <- lm(Sales ~ . , data= train) # full model with all predictors
Almost all variables are significant (p-value < 0.05). Only StoreType is insignificant (p-value>0.05). Multiple R-squared is the same as Adjusted R-squred = 0.8286. It means that 82.86% of changes in Sales are interpreted by the model.
# Step-wise selection in both directions, from the intercept-only model
# up to the full model (output suppressed with trace = 0).
stepMod <- step(base.mod,
                scope = list(lower = base.mod, upper = all.mod),
                direction = "both", trace = 0, steps = 1000)
# Selected predictors = coefficient names (first element of the lm
# object) minus the intercept.
shortlistedVars <- names(unlist(stepMod[[1]]))
shortlistedVars <- shortlistedVars[!shortlistedVars %in% "(Intercept)"]
print(shortlistedVars)
## [1] "Open" "yr_S" "Promo"
## [4] "DayOfWeek2" "DayOfWeek3" "DayOfWeek4"
## [7] "DayOfWeek5" "DayOfWeek6" "DayOfWeek7"
## [10] "Month" "qtr_S" "Sunday_sales"
## [13] "StateHolidaya" "StateHolidayb" "StateHolidayc"
## [16] "SchoolHoliday" "Year" "halfyr_S"
## [19] "halfyr_Cs" "qtr_Cs" "yr_Cs"
## [22] "DayssinceSm" "Assortmentb" "Assortmentc"
## [25] "Promo2" "date_since_promo2" "StoreTypeb"
## [28] "StoreTypec" "StoreTyped" "CompetitionDistance"
## [31] "date_since_comp_op"
It looks, like variables “Open”, “yr_S”, “Promo”, “DayOfWeek2”, “DayOfWeek3” are the most important variables.
# Project helper defining regressionMetrics() (MSE/RMSE/MAE/MedAE/R2).
source('regressionMetrics.R')
# Baseline linear model on all remaining predictors.
model1 <- lm(Sales ~ .,
data = train)
summary(model1)
##
## Call:
## lm(formula = Sales ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13365.7 -936.1 -130.3 785.3 24886.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.969e+05 1.426e+04 -20.830 < 2e-16 ***
## Sunday_sales -7.167e+02 2.396e+01 -29.906 < 2e-16 ***
## DayOfWeek2 -1.027e+03 1.026e+01 -100.047 < 2e-16 ***
## DayOfWeek3 -1.463e+03 1.039e+01 -140.853 < 2e-16 ***
## DayOfWeek4 -1.404e+03 1.037e+01 -135.350 < 2e-16 ***
## DayOfWeek5 -1.098e+03 1.034e+01 -106.274 < 2e-16 ***
## DayOfWeek6 -1.095e+03 1.110e+01 -98.602 < 2e-16 ***
## DayOfWeek7 -3.409e+02 3.919e+01 -8.698 < 2e-16 ***
## Open 6.601e+03 3.858e+01 171.128 < 2e-16 ***
## Promo 2.213e+03 6.523e+00 339.334 < 2e-16 ***
## StateHolidaya -3.040e+02 4.191e+01 -7.253 4.07e-13 ***
## StateHolidayb -1.081e+03 4.843e+01 -22.313 < 2e-16 ***
## StateHolidayc 2.178e+02 6.021e+01 3.617 0.000298 ***
## SchoolHoliday 2.211e+02 7.903e+00 27.973 < 2e-16 ***
## StoreTypeb -1.538e+02 3.989e+01 -3.856 0.000115 ***
## StoreTypec -5.853e+00 8.164e+00 -0.717 0.473447
## StoreTyped 3.542e+00 7.354e+00 0.482 0.630088
## Assortmentb 4.585e+02 5.570e+01 8.232 < 2e-16 ***
## Assortmentc 2.355e+01 6.173e+00 3.816 0.000136 ***
## CompetitionDistance -1.059e-03 3.878e-04 -2.731 0.006308 **
## Promo2 8.775e+01 8.538e+00 10.277 < 2e-16 ***
## Month 5.712e+01 2.312e+00 24.709 < 2e-16 ***
## Year 1.446e+02 7.075e+00 20.434 < 2e-16 ***
## date_since_promo2 -5.734e-02 6.689e-03 -8.573 < 2e-16 ***
## date_since_comp_op 3.250e-03 1.306e-03 2.489 0.012814 *
## qtr_S -8.201e-01 4.905e-02 -16.719 < 2e-16 ***
## halfyr_S 2.047e+00 5.902e-02 34.680 < 2e-16 ***
## yr_S -2.137e-01 2.687e-02 -7.951 1.85e-15 ***
## qtr_Cs 1.402e+01 5.169e-01 27.126 < 2e-16 ***
## halfyr_Cs -1.905e+01 6.044e-01 -31.521 < 2e-16 ***
## yr_Cs 4.788e+00 2.359e-01 20.298 < 2e-16 ***
## DayssinceSm 1.193e+00 1.164e-01 10.251 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1617 on 352140 degrees of freedom
## Multiple R-squared: 0.8286, Adjusted R-squared: 0.8286
## F-statistic: 5.492e+04 on 31 and 352140 DF, p-value: < 2.2e-16
# Predict on the held-out test set. Use the full argument name `newdata`
# — the original `new=` only worked via partial argument matching.
model1.pred <- predict(model1, newdata = test)
summary(model1.pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3432 4664 6462 6178 8159 23877
Almost all variables are significant (p-value < 0.05). Only StoreType is insignificant (p-value>0.05). Multiple R-squared is the same as Adjusted R-squred = 0.8286. It means that 82.86% of changes in Sales are interpreted by the model.
# test dataset
# Evaluate model1 on the test set with the project metrics helper.
LNModel <- regressionMetrics(real = test$Sales,
predicted = model1.pred)
LNModel
## MSE RMSE MAE MedAE R2
## 1 2155991 1468.329 1062.919 820.1402 0.8460609
84.60 % of variation in Sales is described by LNModel. Root mean square deviation is equal to 1468.329.
# Reduced linear model without StoreType. NOTE(review): SchoolHoliday is
# also absent from this formula although the surrounding text mentions
# dropping only StoreType — confirm this omission was intentional.
model1A <- lm(Sales ~ Sunday_sales + DayOfWeek + Open + Promo + StateHoliday +
Assortment + CompetitionDistance + Promo2 + Month + Year + date_since_promo2 +
date_since_comp_op + qtr_S + halfyr_S + yr_S + qtr_Cs + halfyr_Cs +
yr_Cs + DayssinceSm,
data = train)
summary(model1A)
##
## Call:
## lm(formula = Sales ~ Sunday_sales + DayOfWeek + Open + Promo +
## StateHoliday + Assortment + CompetitionDistance + Promo2 +
## Month + Year + date_since_promo2 + date_since_comp_op + qtr_S +
## halfyr_S + yr_S + qtr_Cs + halfyr_Cs + yr_Cs + DayssinceSm,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13390.6 -937.9 -134.0 781.9 24855.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.880e+05 1.426e+04 -20.192 < 2e-16 ***
## Sunday_sales -7.650e+02 1.997e+01 -38.314 < 2e-16 ***
## DayOfWeek2 -1.019e+03 1.027e+01 -99.205 < 2e-16 ***
## DayOfWeek3 -1.456e+03 1.040e+01 -140.040 < 2e-16 ***
## DayOfWeek4 -1.399e+03 1.038e+01 -134.692 < 2e-16 ***
## DayOfWeek5 -1.094e+03 1.035e+01 -105.754 < 2e-16 ***
## DayOfWeek6 -1.138e+03 1.100e+01 -103.443 < 2e-16 ***
## DayOfWeek7 -4.043e+02 3.911e+01 -10.338 < 2e-16 ***
## Open 6.582e+03 3.854e+01 170.775 < 2e-16 ***
## Promo 2.204e+03 6.521e+00 337.943 < 2e-16 ***
## StateHolidaya -3.267e+02 4.189e+01 -7.799 6.25e-15 ***
## StateHolidayb -9.443e+02 4.815e+01 -19.611 < 2e-16 ***
## StateHolidayc 3.600e+02 5.998e+01 6.001 1.97e-09 ***
## Assortmentb 3.996e+02 5.165e+01 7.737 1.02e-14 ***
## Assortmentc 2.672e+01 6.017e+00 4.441 8.96e-06 ***
## CompetitionDistance -8.349e-04 3.846e-04 -2.171 0.0299 *
## Promo2 8.903e+01 8.545e+00 10.419 < 2e-16 ***
## Month 6.312e+01 2.304e+00 27.398 < 2e-16 ***
## Year 1.402e+02 7.080e+00 19.799 < 2e-16 ***
## date_since_promo2 -5.815e-02 6.690e-03 -8.693 < 2e-16 ***
## date_since_comp_op 3.151e-03 1.306e-03 2.413 0.0158 *
## qtr_S -7.246e-01 4.898e-02 -14.792 < 2e-16 ***
## halfyr_S 1.895e+00 5.883e-02 32.211 < 2e-16 ***
## yr_S -1.551e-01 2.682e-02 -5.783 7.34e-09 ***
## qtr_Cs 1.345e+01 5.170e-01 26.006 < 2e-16 ***
## halfyr_Cs -1.817e+01 6.041e-01 -30.077 < 2e-16 ***
## yr_Cs 4.456e+00 2.357e-01 18.908 < 2e-16 ***
## DayssinceSm 9.728e-01 1.162e-01 8.371 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1619 on 352144 degrees of freedom
## Multiple R-squared: 0.8282, Adjusted R-squared: 0.8282
## F-statistic: 6.289e+04 on 27 and 352144 DF, p-value: < 2.2e-16
Excluding the variable StoreType decreases the R2 from 82.86% to 82.82% on the train dataset. The change is small, but still worth noting. In model1A all variables are significant (p-value < 0.05). Let’s make the prediction on the test data set.
# Predict with the reduced model and compute test-set metrics. Use the
# full argument name `newdata` (the original relied on partial matching).
model1A.pred <- predict(model1A, newdata = test)
LNModelA <- regressionMetrics(real = test$Sales,
                              predicted = model1A.pred)
LNModelA # 84.73 %
## MSE RMSE MAE MedAE R2
## 1 2137778 1462.114 1050.992 805.5585 0.8473613
After removing insignificant variable our result is a little bit improved comparing to model1. 84.73 % of changes in Sales are predicted by the model.
Investigating further, we check how good a model is with only the most significant predictors after step-wise regression.
# Compact model on the top step-wise predictors. NOTE(review): the text
# says "5 most significant variables" but the formula has four terms
# (DayOfWeek just expands to several dummy columns).
model1B <- lm(Sales ~ Open + yr_S + Promo + DayOfWeek,
data = train)
summary(model1B)
##
## Call:
## lm(formula = Sales ~ Open + yr_S + Promo + DayOfWeek, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14030.8 -956.3 -139.0 795.0 25110.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.406e+03 1.765e+01 -306.30 <2e-16 ***
## Open 6.812e+03 1.492e+01 456.64 <2e-16 ***
## yr_S 9.744e-01 1.334e-03 730.38 <2e-16 ***
## Promo 2.212e+03 6.609e+00 334.65 <2e-16 ***
## DayOfWeek2 -9.944e+02 1.043e+01 -95.38 <2e-16 ***
## DayOfWeek3 -1.434e+03 1.043e+01 -137.49 <2e-16 ***
## DayOfWeek4 -1.391e+03 1.039e+01 -133.78 <2e-16 ***
## DayOfWeek5 -1.102e+03 1.043e+01 -105.67 <2e-16 ***
## DayOfWeek6 -1.127e+03 1.110e+01 -101.49 <2e-16 ***
## DayOfWeek7 -1.868e+02 1.741e+01 -10.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1649 on 352162 degrees of freedom
## Multiple R-squared: 0.8219, Adjusted R-squared: 0.8219
## F-statistic: 1.806e+05 on 9 and 352162 DF, p-value: < 2.2e-16
The results for model1B on train data set are the worst comparing to two previouse models. R2 is equal to 82.19 %.
# Predict with the four-predictor model and compute test-set metrics.
# Use the full argument name `newdata` (the original relied on partial
# argument matching via `new=`).
model1B.pred <- predict(model1B, newdata = test)
# test dataset
LNModelB <- regressionMetrics(real = test$Sales,
                              predicted = model1B.pred) # 84.75 %
LNModelB
## MSE RMSE MAE MedAE R2
## 1 2134447 1460.975 1039.168 783.5543 0.8475992
What’s interesting, that R2 on train data set is lower but on test data set is bigger than in previouse two models, but still difference isn’t too big. 84.75 % of changes in Sales are predicted with a model1B. RMSE is smaller than in model1 and model1A based on test data set.
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 1.4.2 ✔ stringr 1.3.1
## ✔ purrr 0.2.5 ✔ forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'purrr' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4
## Warning: package 'forcats' was built under R version 3.4.3
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ data.table::between() masks dplyr::between()
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks timeSeries::filter(), stats::filter()
## ✖ data.table::first() masks dplyr::first(), xts::first()
## ✖ data.table::hour() masks lubridate::hour()
## ✖ lubridate::intersect() masks base::intersect()
## ✖ data.table::isoweek() masks lubridate::isoweek()
## ✖ dplyr::lag() masks timeSeries::lag(), stats::lag()
## ✖ data.table::last() masks dplyr::last(), xts::last()
## ✖ data.table::mday() masks lubridate::mday()
## ✖ data.table::minute() masks lubridate::minute()
## ✖ data.table::month() masks lubridate::month()
## ✖ .GlobalEnv::quarter() masks data.table::quarter(), lubridate::quarter()
## ✖ data.table::second() masks lubridate::second()
## ✖ lubridate::setdiff() masks base::setdiff()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ lubridate::union() masks base::union()
## ✖ data.table::wday() masks lubridate::wday()
## ✖ data.table::week() masks lubridate::week()
## ✖ data.table::yday() masks lubridate::yday()
## ✖ data.table::year() masks lubridate::year()
## Warning: package 'MASS' was built under R version 3.4.4
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Warning: package 'tree' was built under R version 3.4.4
## Warning: package 'caret' was built under R version 3.4.4
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.4.4
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## Warning: package 'rpart' was built under R version 3.4.3
## Warning: package 'rpart.plot' was built under R version 3.4.4
## Warning: package 'rattle' was built under R version 3.4.4
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
Let’s run regression tree on all predictors on train data set.
# Regression tree (package `tree`) on all remaining predictors.
model2 <- Sales ~ .
Rossmann.tree <- tree(model2 ,train)
summary(Rossmann.tree)
##
## Regression tree:
## tree(formula = model2, data = train)
## Variables actually used in tree construction:
## [1] "Open" "qtr_S" "Promo" "halfyr_S" "yr_S"
## Number of terminal nodes: 8
## Residual mean deviance: 2800000 = 9.861e+11 / 352200
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -11000.0 -861.1 0.0 0.0 576.8 27240.0
Only 5 variables are seleted in tree construction: Open, qtr_S, Promo, halfyr_S, yr_S
Let’s visualise the tree on the plot.
# Draw the fitted tree and label the splits.
plot(Rossmann.tree)
text(Rossmann.tree, pretty = 0)
Number of terminal nodes: 8. Variables Open, Promo and yr_S are significant both Regression Tree and Step-wise Regression.
We apply cross-validation with 10 folds to assess the appropriate tree size.
# 10-fold cross-validation over tree size; plot deviance vs leaf count.
Rossmann.cv <- cv.tree(Rossmann.tree, K = 10)
plot(Rossmann.cv$size, Rossmann.cv$dev, type = 'b')
We assume that the appropriate number of final (terminal) nodes is 5, and then plot the pruned tree.
# Prune to 5 terminal nodes (chosen from the CV curve) and redraw.
Rossmann.prune <- prune.tree(Rossmann.tree, best = 5)
plot(Rossmann.prune)
text(Rossmann.prune, pretty = 0)
And finally we can build and visualise the prediction.
# Predict with the (unpruned) tree on the test set.
model2.pred <- predict(Rossmann.tree, test)
# visualising prediction: predicted vs actual, with the y = x line.
Rossmann.test1 <- test$Sales
plot(model2.pred, Rossmann.test1)
abline(0, 1)
# mean square of prediction error
RegressTree <- regressionMetrics(real = test$Sales,
predicted = model2.pred)
RegressTree
## MSE RMSE MAE MedAE R2
## 1 2393061 1546.952 1015.954 712.9639 0.8291339
The prediction is worse than in the case of the linear model. R2 is equal to 82.91% and RMSE is 1546.952.
# Regression tree restricted to the top step-wise predictors; printing
# the object shows the full split structure.
model2B <- Sales ~ Open + yr_S + Promo + DayOfWeek
Rossmann.treeB <- tree(model2B ,train)
Rossmann.treeB
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 352172 5.374e+12 5844
## 2) Open < 0.5 60877 0.000e+00 0 *
## 3) Open > 0.5 291295 2.860e+12 7065
## 6) yr_S < 6516.19 207355 8.767e+11 5895
## 12) Promo < 0.5 112198 2.863e+11 4937
## 24) yr_S < 4625.83 48395 6.152e+10 3903 *
## 25) yr_S > 4625.83 63803 1.338e+11 5721 *
## 13) Promo > 0.5 95157 3.660e+11 7025
## 26) yr_S < 4804.01 46163 9.400e+10 5825 *
## 27) yr_S > 4804.01 48994 1.431e+11 8154 *
## 7) yr_S > 6516.19 83940 9.978e+11 9957
## 14) yr_S < 10099.9 72638 5.075e+11 9222
## 28) Promo < 0.5 39382 1.671e+11 7889 *
## 29) Promo > 0.5 33256 1.875e+11 10800 *
## 15) yr_S > 10099.9 11302 1.987e+11 14680 *
# Summary: which variables were actually used and the residual deviance.
summary(Rossmann.treeB)
##
## Regression tree:
## tree(formula = model2B, data = train)
## Variables actually used in tree construction:
## [1] "Open" "yr_S" "Promo"
## Number of terminal nodes: 8
## Residual mean deviance: 2799000 = 9.858e+11 / 352200
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -11000.0 -862.7 0.0 0.0 578.6 27240.0
# Draw the restricted tree with split labels.
plot(Rossmann.treeB)
text(Rossmann.treeB, pretty = 0)
It looks like Open, yr_S, Promo are the most significant variabels.
As previously we apply cross-validation with 10 folds to access appropriate tree size.
# 10-fold cross-validation over the restricted tree's size.
Rossmann.cvB <- cv.tree(Rossmann.treeB, K = 10)
# BUG FIX: the original plotted Rossmann.cv$devB — a non-existent field
# of the EARLIER CV object (i.e. NULL) — instead of this run's deviance.
plot(Rossmann.cvB$size, Rossmann.cvB$dev, type = 'b')
And assume that the appropriate number of final nodes is 5 (terminal noods).
# Prune the restricted tree to 5 terminal nodes.
Rossmann.pruneB <- prune.tree(Rossmann.treeB, best = 5)
Let’s prunne tree on the plot and finally build the prediction
# Predict with the restricted tree on the test set.
model2B.pred <- predict(Rossmann.treeB, test)
# visualising prediction: predicted vs actual, with the y = x line.
Rossmann.test1 <- test$Sales
plot(model2B.pred, Rossmann.test1)
abline(0, 1)
# mean square of prediction error
RegressTreeB <- regressionMetrics(real = test$Sales,
predicted = model2B.pred)
RegressTreeB
## MSE RMSE MAE MedAE R2
## 1 2420245 1555.714 1029.322 728 0.827193
R2 is 82.71% and RMSE is equal to 1555.714. These results are slightly worse than for the regression tree with all variables, and still worse than for the simple regression.
We fit the model based on the training data set.
library(xgboost)
## Warning: package 'xgboost' was built under R version 3.4.4
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:rattle':
##
## xgboost
## The following object is masked from 'package:dplyr':
##
## slice
set.seed(123324)
# Flag guarding the (slow) cross-validated xgboost training run; set to
# TRUE to retrain instead of using saved results.
training <- FALSE
if (training) {
  model3 <- train(
    Sales ~ ., data = train, method = "xgbTree",
    trControl = trainControl("cv", number = 10)
  )
  model3$bestTune
  # And then let's make a prediction on the test data set.
  predictions <- model3 %>% predict(test)
  head(predictions)
  # BUG FIX: the original saved `predicitons` (typo), which would error
  # with "object not found" whenever training = TRUE.
  saveRDS(predictions, file = 'predictions.Rds')
  XGBoost <- regressionMetrics(real = test$Sales,
                               predicted = predictions)
  XGBoost
}
For the purpose of further analysis we need to create two objects:
y for storing the outcome variable x for holding the predictor variables
# Packages for the penalized (ridge) regression section.
library(tidyverse)
library(caret)
#install.packages("glmnet")
library(glmnet)
## Warning: package 'glmnet' was built under R version 3.4.4
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 3.4.4
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.4.3
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded glmnet 2.0-16
# Predictor variables: design matrix with dummy-coded factors;
# [,-1] drops the intercept column (glmnet adds its own).
x <- model.matrix(Sales~., train)[,-1]
# Outcome variable
y <- train$Sales
In penalized regression, you have to specify a constant lambda to adjust the amount of the coefficient shrinkage. The best lambda for the data is defined as the lambda that minimize the cross-validation prediction error rate. This is determined automatically using the function cv.glmnet().
Alpha is equal to 0, because this value is defined for Ridge Regression.
# Find the best lambda using cross-validation (cv.glmnet's default 10-fold);
# alpha = 0 selects the ridge penalty.
set.seed(123)
cv <- cv.glmnet(x, y, alpha = 0)
cv$lambda.min
## [1] 293.2092
In our case the minimum lambda is 293.2092.
# Fit the final ridge model on the training data at the CV-selected lambda
model5 <- glmnet(x, y, alpha = 0, lambda = cv$lambda.min)
# Display regression coefficients
coef(model5)
## 32 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -2.819662e+05
## Sunday_sales -4.385642e+02
## DayOfWeek2 -5.599388e+02
## DayOfWeek3 -9.824149e+02
## DayOfWeek4 -9.500882e+02
## DayOfWeek5 -6.663020e+02
## DayOfWeek6 -7.028513e+02
## DayOfWeek7 -2.457056e+03
## Open 3.917230e+03
## Promo 2.101716e+03
## StateHolidaya -2.618597e+03
## StateHolidayb -3.248947e+03
## StateHolidayc -2.309552e+03
## SchoolHoliday 2.023427e+02
## StoreTypeb -1.877237e+02
## StoreTypec -2.463012e+01
## StoreTyped 9.874562e+01
## Assortmentb -1.747349e+02
## Assortmentc 8.238457e+01
## CompetitionDistance 1.402994e-03
## Promo2 7.312756e+01
## Month 5.377921e+01
## Year 1.383517e+02
## date_since_promo2 -2.334723e-02
## date_since_comp_op 4.994841e-03
## qtr_S 3.026933e-01
## halfyr_S 2.944394e-01
## yr_S 2.968929e-01
## qtr_Cs 2.287218e-01
## halfyr_Cs 1.555869e-01
## yr_Cs 2.056544e-01
## DayssinceSm 1.007761e+00
Let’s make predictions on the test data
# Build the test-set design matrix (dropping the intercept column) and
# score the ridge model on it.
x.test <- model.matrix(Sales ~ ., test)[, -1]
model5.pred <- as.vector(predict(model5, x.test))
# Summarise prediction quality (MSE, RMSE, MAE, MedAE, R2) against actuals.
RidgeRegress_1 <- regressionMetrics(real = test$Sales, predicted = model5.pred)
RidgeRegress_1
## MSE RMSE MAE MedAE R2
## 1 2232567 1494.178 1060.956 799.758 0.8405933
R2 is equal 84.06% and RMSE is 1494.178 - these results are quite similar to the results from linear regression.
The only difference between the R code used for ridge regression is that, for lasso regression, you need to specify the argument alpha = 1 instead of alpha = 0 (for ridge regression). As in the previous case, we find lambda using cross-validation.
set.seed(12345)
# LASSO penalty (alpha = 1); lambda chosen by cross-validation.
cv <- cv.glmnet(x, y, alpha = 1)
cv$lambda.min
## [1] 2.734479
As you can see, the best lambda is equal to 2.734479.
Let’s fit the final model on the training data and display regression coefficients.
# Final LASSO fit at the CV-selected lambda; '.' in the coefficient table
# marks predictors shrunk exactly to zero.
model6 <- glmnet(x, y, alpha = 1, lambda = cv$lambda.min)
coef(model6)
## 32 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -3.128304e+05
## Sunday_sales -7.173064e+02
## DayOfWeek2 -9.734598e+02
## DayOfWeek3 -1.421121e+03
## DayOfWeek4 -1.367403e+03
## DayOfWeek5 -1.061747e+03
## DayOfWeek6 -1.064592e+03
## DayOfWeek7 -2.204128e+02
## Open 6.694886e+03
## Promo 2.208735e+03
## StateHolidaya -1.700022e+02
## StateHolidayb -9.436744e+02
## StateHolidayc 3.076658e+02
## SchoolHoliday 1.967546e+02
## StoreTypeb -1.546746e+02
## StoreTypec .
## StoreTyped 1.212009e+01
## Assortmentb 2.954582e+02
## Assortmentc 2.449938e+01
## CompetitionDistance -5.785099e-05
## Promo2 7.412429e+01
## Month 7.459945e+01
## Year 1.523642e+02
## date_since_promo2 -3.845303e-02
## date_since_comp_op 2.334481e-03
## qtr_S 5.001842e-01
## halfyr_S 2.448082e-01
## yr_S 2.597846e-01
## qtr_Cs .
## halfyr_Cs -1.590857e-01
## yr_Cs .
## DayssinceSm 2.255933e-01
Make predictions on the test data and check results.
# Score the LASSO model on the test-set design matrix and evaluate.
x.test2 <- model.matrix(Sales ~ ., test)[, -1]
model6.pred <- as.vector(predict(model6, x.test2))
LassoRegress_1 <- regressionMetrics(real = test$Sales, predicted = model6.pred)
LassoRegress_1
## MSE RMSE MAE MedAE R2
## 1 2164143 1471.103 1064.469 823.8259 0.8454788
R2 is 84.54 % and RMSE is equal to 1471.103. The results are better than in simple linear regression and the regression tree. So let’s investigate further.
In further investigation we want to compute and compare ridge and lasso regression using the caret workflow. Caret will automatically choose the best tuning parameter values, compute the final model and evaluate the model performance using cross-validation techniques.
We setup a grid range of lambda values and compute ridge regression.
# Candidate lambda grid: 100 values from 1e-3 to 1e3 on a log scale.
lambda <- 10^seq(-3, 3, length = 100)
set.seed(1235)
# Ridge regression (alpha = 0) tuned over the lambda grid with 10-fold CV.
ridge <- train(
Sales ~., data = train, method = "glmnet",
trControl = trainControl("cv", number = 10),
tuneGrid = expand.grid(alpha = 0, lambda = lambda)
)
# Coefficients of the final model at the CV-selected lambda.
coef(ridge$finalModel, ridge$bestTune$lambda)
## 32 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -2.847309e+05
## Sunday_sales -4.403558e+02
## DayOfWeek2 -5.925777e+02
## DayOfWeek3 -1.017446e+03
## DayOfWeek4 -9.829249e+02
## DayOfWeek5 -6.975017e+02
## DayOfWeek6 -7.317390e+02
## DayOfWeek7 -2.450214e+03
## Open 3.968343e+03
## Promo 2.110474e+03
## StateHolidaya -2.593819e+03
## StateHolidayb -3.237227e+03
## StateHolidayc -2.272014e+03
## SchoolHoliday 2.020036e+02
## StoreTypeb -1.722348e+02
## StoreTypec -2.285709e+01
## StoreTyped 9.012787e+01
## Assortmentb -1.138023e+02
## Assortmentc 7.748702e+01
## CompetitionDistance 1.187428e-03
## Promo2 7.357451e+01
## Month 5.488974e+01
## Year 1.397040e+02
## date_since_promo2 -2.582179e-02
## date_since_comp_op 4.897321e-03
## qtr_S 3.105107e-01
## halfyr_S 3.001296e-01
## yr_S 2.953491e-01
## qtr_Cs 1.872586e-01
## halfyr_Cs 1.083612e-01
## yr_Cs 2.116699e-01
## DayssinceSm 9.795607e-01
Finally, we make a prediction and check the results.
# Score the caret-tuned ridge model on the hold-out set and report metrics.
model8 <- predict(ridge, test)
RidgeRegress_2 <- regressionMetrics(real = test$Sales, predicted = model8)
RidgeRegress_2
## MSE RMSE MAE MedAE R2
## 1 2224438 1491.455 1060.239 802.0917 0.8411737
R2 is 84.11 %, which is better than in the case of Ridge Regression using the cross-validation method but worse than LASSO Regression (84.54%). In Ridge Regression using the caret package, RMSE is equal to 1491.455 — lower than in Ridge Regression using cross-validation (1494.178) but higher than in Lasso Regression using cross-validation.
set.seed(123)
# LASSO (alpha = 1) tuned over the same lambda grid with 10-fold CV.
lasso <- train(
Sales ~., data = train, method = "glmnet",
trControl = trainControl("cv", number = 10),
tuneGrid = expand.grid(alpha = 1, lambda = lambda)
)
# Coefficients at the CV-selected lambda; '.' marks zeroed-out predictors.
coef(lasso$finalModel, lasso$bestTune$lambda)
## 32 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -3.204719e+05
## Sunday_sales -7.308101e+02
## DayOfWeek2 -9.723828e+02
## DayOfWeek3 -1.416511e+03
## DayOfWeek4 -1.364407e+03
## DayOfWeek5 -1.058877e+03
## DayOfWeek6 -1.061828e+03
## DayOfWeek7 -1.626491e+02
## Open 6.750595e+03
## Promo 2.209169e+03
## StateHolidaya -1.141447e+02
## StateHolidayb -8.892639e+02
## StateHolidayc 3.737920e+02
## SchoolHoliday 1.950170e+02
## StoreTypeb -1.529149e+02
## StoreTypec .
## StoreTyped 1.485219e+01
## Assortmentb 2.972015e+02
## Assortmentc 2.988212e+01
## CompetitionDistance -3.772743e-06
## Promo2 7.540055e+01
## Month 7.550407e+01
## Year 1.561285e+02
## date_since_promo2 -3.946825e-02
## date_since_comp_op 2.736665e-03
## qtr_S 5.337721e-01
## halfyr_S 1.511060e-01
## yr_S 3.178800e-01
## qtr_Cs .
## halfyr_Cs -1.433309e-01
## yr_Cs .
## DayssinceSm 1.334948e-01
Let’s make prediction and check the results.
# Score the caret-tuned LASSO model on the hold-out set and report metrics.
model9 <- predict(lasso, test)
LassoRegress_2 <- regressionMetrics(real = test$Sales, predicted = model9)
LassoRegress_2
## MSE RMSE MAE MedAE R2
## 1 2166087 1471.763 1066.168 824.8719 0.8453401
The results are better than we have for Ridge and Lasso Regression using the cross-validation technique. R2 is 84.53% and RMSE is 1471.763.
# Only select the few variables needed for the ARIMA analysis.
rossmann1 <- rossmann1%>%
dplyr::select(Store,Date,DayOfWeek,Month,Year,Sales,Customers)%>%
arrange(Store,Date)
# Flag stores with incomplete history: stores with exactly 758 observed
# days get missing = 1 (presumably stores with a gap in the panel — TODO
# confirm the 758 threshold against the raw data). The per-store flag is
# joined back onto the row-level data.
rossmann1 <-rossmann1%>%
group_by(Store)%>%
summarise(days_count = n())%>%
ungroup()%>%
mutate(missing = ifelse(days_count==758,1,0))%>%
dplyr::select(-days_count)%>%
left_join(rossmann1, by = 'Store')
# Flag stores that ever sell on Sundays (total Sunday Sales > 0 gives
# Sunday_sales = 1) and attach the flag back onto the data set.
# NOTE: each pipeline rebuilds 'rossmann1' by summarising and then
# left-joining the previous version — the order of these steps matters.
rossmann1 <-rossmann1%>%
filter(DayOfWeek==7)%>%
group_by(Store)%>%
summarise(Sales = sum(Sales))%>%
arrange(Sales)%>%
ungroup()%>%
mutate(Sunday_sales = ifelse(Sales >0,1,0))%>%
dplyr::select(-Sales)%>%
left_join(rossmann1, by = 'Store')%>%
as.data.frame()
# Number of stores and share of total sales by (Sunday_sales, missing)
# group — motivates fitting separate ARIMA models per group.
rossmann1%>%
dplyr::select(missing,Sunday_sales, Store, Sales)%>%
mutate(Sales = Sales/1000)%>% #avoid integer overflow
group_by(Store,missing,Sunday_sales)%>%
summarise(Sales = sum(Sales))%>%
group_by(Sunday_sales,missing)%>%
summarise(store_count = n(),
Sales = sum(Sales))%>%
ungroup()%>%
mutate(sales_pct = Sales/sum(Sales))
## # A tibble: 4 x 5
## Sunday_sales missing store_count Sales sales_pct
## <dbl> <dbl> <int> <dbl> <dbl>
## 1 0 0 903 4884698. 0.832
## 2 0 1 179 738256. 0.126
## 3 1 0 32 246121. 0.0419
## 4 1 1 1 4106. 0.000699
=> It’s worth having two ARIMA models: one for Sunday-sales stores and one for non-Sunday-sales stores. In addition, we need to take the missing-data issue into consideration: for the ARIMA model, we only use the last 6 months of data of the missing-data stores for estimation.
# Daily total sales for Feb-Apr, faceted by Sunday-sales group and year,
# to eyeball within-spring seasonality (missing-data stores excluded).
rossmann1%>%
filter(missing==0, Month %in% c(2,3,4))%>%
group_by(Date,Year,Sunday_sales)%>%
summarise(Sales = sum(Sales))%>%
ungroup()%>%
mutate(doy = yday(Date))%>%
ggplot(aes(x=doy, y= Sales))+geom_line()+
facet_wrap(Sunday_sales~Year,scales="free_y")+
ggtitle(label = 'seasonality of Feb,Mar,Apr pattern over three years Is_Sunday_Sales vs Year')
# Weekly total sales by Sunday-sales group and year — full-year view of
# the weekly seasonality pattern.
rossmann1%>%
filter(missing==0)%>%
mutate(Week = week(Date))%>%
group_by(Year,Week,Sunday_sales)%>%
summarise(Sales = sum(Sales))%>%
ggplot(aes(x=Week, y= Sales))+geom_line()+
facet_wrap(Sunday_sales~Year,scales="free_y")+
ggtitle(label = 'seasonality pattern over three years Is_Sunday_Sales vs Year')
# Investigate the monthly seasonality of the data: monthly total sales,
# one panel per year with free y-scales.
rossmann1%>%
filter(missing==0)%>%
group_by(Year, Month)%>%
summarise(Sales = sum(Sales))%>%
ggplot(aes(x=Month, y= Sales))+geom_line()+
# Fixed: 'facet_free()' is not a ggplot2 function (it lives in ggforce,
# which is never loaded here); facet_wrap with free y-scales gives the
# intended per-year panels.
facet_wrap(~ Year, scales = "free_y")
There are few peaks:
+ March: ? + June: Summer starts + December: Xmas
Split the data into train and test set
# Hold out the last 6 weeks (42 days) of each store's history as the test
# window for the ARIMA comparison.
window <- 6*7
split_date <-max(rossmann1$Date) - window
#
train1 <- rossmann1%>%
group_by(Store)%>%
filter(Date <=split_date)
test1 <- rossmann1%>%
group_by(Store)%>%
filter(Date > split_date)
# again split the train set into sunday sales group and non-sunday sales group
train1_nosun <- train1%>%
filter(missing==0, Sunday_sales==0, Sales >0)
# Mean daily sales across non-Sunday-sales stores (series used for model
# identification below).
train1_nosun_agg <- train1_nosun%>%
group_by(Date,Year, Month)%>%
summarise(Sales = mean(Sales))
train1_sun <- train1%>%
filter(missing==0, Sunday_sales==1, Sales >0)
# Fixed: the Sunday aggregate was built from 'train1_nosun' (copy-paste
# error), so it silently duplicated the non-Sunday aggregate; it must
# aggregate the Sunday subset 'train1_sun'.
train1_sun_agg <- train1_sun%>%
group_by(Date,Year, Month)%>%
summarise(Sales = mean(Sales))
For each store we estimate the parameters of the general model form using its historical data and make a prediction. In other words, we loop through all stores to estimate each model’s parameters and forecast.
# ACF/PACF of the 12-lag seasonal difference (12 trading days = two 6-day
# weeks, since Sundays are excluded for this group).
acf(diff(train1_nosun_agg$Sales,12) ,lag.max =150, ylim = c(-1,1), lwd = 5,
col = "dark green",na.action = na.pass, main = 'ACF - 12 days (2 weeks) seasonal adjusted sales')
pacf(diff(train1_nosun_agg$Sales,12),lag.max = 150,lwd = 5,
col = "dark green",na.action = na.pass, main ='PACF - 12 days (2 weeks) seasonal adjusted sales')
plot(diff(train1_nosun_agg$Sales,12),
type = 'l',
main = 'Plot of 12 day seasonal adjusted Sales')
It’s hard to say that the time series with a 12-lag difference is stationary; hopefully some additional seasonal lags can help to solve this problem. Possible model: an ARMA model with up to 12 lags + the first difference of the seasonal factor and its first few seasonal lags.
# Project helpers — presumably testdf() runs ADF tests over increasing
# augmentations with Breusch-Godfrey autocorrelation checks, and
# adf_test_rename() relabels the output columns (defined in
# custom_functions.R; not visible here — confirm there).
source('custom_functions.R')
adf_test <-testdf(variable = diff(train1_nosun_agg$Sales,12),
max.augmentations = 12,max.order = 8,
plot_title='Plot of the diff(Sales_t, Sales_T-12)')
adf_test_rename(adf_test)
## agu p_adf p_bg.p1 p_bg.p2 p_bg.p3 p_bg.p4 p_bg.p5 p_bg.p6 p_bg.p7
## 1 0 0.01 0.056 0.000 0.000 0.000 0.000 0.000 0.000
## 2 1 0.01 0.896 0.988 0.924 0.029 0.011 0.005 0.002
## 3 2 0.01 0.962 0.928 0.984 0.022 0.011 0.006 0.002
## 4 3 0.01 0.801 0.969 0.918 0.305 0.075 0.027 0.006
## 5 4 0.01 0.721 0.937 0.985 0.980 0.973 0.903 0.160
## 6 5 0.01 0.911 0.963 0.995 0.999 0.999 1.000 0.652
## 7 6 0.01 0.971 0.999 0.998 1.000 0.949 0.960 0.929
## 8 7 0.01 0.977 0.998 1.000 0.994 0.945 0.959 0.853
## 9 8 0.01 0.921 0.993 0.998 0.977 0.899 0.927 0.862
## 10 9 0.01 0.928 0.577 0.728 0.860 0.875 0.858 0.812
## 11 10 0.01 0.764 0.561 0.678 0.822 0.853 0.864 0.812
## 12 11 0.01 0.580 0.175 0.322 0.470 0.615 0.700 0.798
## 13 12 0.01 0.789 0.192 0.347 0.499 0.640 0.726 0.821
## p_bg.p8
## 1 0.000
## 2 0.001
## 3 0.001
## 4 0.007
## 5 0.205
## 6 0.741
## 7 0.950
## 8 0.792
## 9 0.697
## 10 0.755
## 11 0.754
## 12 0.868
## 13 0.888
H0: the time series is NON-stationary => We reject the null hypothesis that diff(Sales_t, Sales_T-12) is non-stationary with 4 augmentations.
# Phillips-Perron unit-root test (Z-tau statistic, trend specification)
# on the 12-lag seasonal difference.
pp_test <- ur.pp(diff(train1_nosun_agg$Sales,12),type = c("Z-tau"),model = c("trend"))
summary(pp_test)
##
## ##################################
## # Phillips-Perron Unit Root Test #
## ##################################
##
## Test regression with intercept and trend
##
##
## Call:
## lm(formula = y ~ y.l1 + trend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9294.2 -416.3 6.8 533.7 5513.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.17125 55.72428 0.093 0.926
## y.l1 0.43116 0.03320 12.986 <2e-16 ***
## trend -0.04698 0.26050 -0.180 0.857
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1517 on 738 degrees of freedom
## Multiple R-squared: 0.1861, Adjusted R-squared: 0.1838
## F-statistic: 84.35 on 2 and 738 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic, type: Z-tau is: -17.6889
##
## aux. Z statistics
## Z-tau-mu 0.0779
## Z-tau-beta -0.1816
##
## Critical values for Z statistics:
## 1pct 5pct 10pct
## critical values -3.975159 -3.418083 -3.131177
H0: time series is non-stationary. Z-tau = -17.6889 < -3.975159 (1% critical value) => reject the null about non-stationarity of diff(Sales_t, Sales_T-12).
# KPSS stationarity test with a constant deterministic component ("mu").
# Fixed: the original referenced 'train_nosun_agg', which does not exist
# (the knitted output showed "object 'train_nosun_agg' not found" and the
# subsequent summary() failed); the aggregated series is 'train1_nosun_agg'.
kpss <- ur.kpss(diff(train1_nosun_agg$Sales,12),type = c("mu")) # constant deterministic component
summary(kpss)
H0: time series is stationary; test statistic = 0.0109 < 0.463 (5% critical value) => we can’t reject the null about STATIONARITY of diff(Sales_t, Sales_T-12).
We configure the model manually and come up with the best model
# Manually selected SARIMA: non-seasonal AR(14), seasonal (3,1,1) with
# period 12 (two 6-day trading weeks), no constant term.
arima_best <- Arima(train1_nosun_agg$Sales,order = c(14,0,0),
seasonal=list(order=c(3,1,1),period=12),
include.constant = FALSE)
summary(arima_best)
## Series: train1_nosun_agg$Sales
## ARIMA(14,0,0)(3,1,1)[12]
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5 ar6 ar7 ar8
## 0.3619 0.1967 -0.0045 0.1051 -0.1896 -0.0456 -0.1096 0.0328
## s.e. 0.0364 0.0387 0.0390 0.0406 0.0401 0.0411 0.0404 0.0395
## ar9 ar10 ar11 ar12 ar13 ar14 sar1 sar2
## 0.1165 -0.0297 0.0721 -0.1794 -0.0788 0.1275 0.4906 0.0079
## s.e. 0.0400 0.0394 0.0390 0.1345 0.0590 0.0420 0.1449 0.0921
## sar3 sma1
## 0.1161 -1.0000
## s.e. 0.0385 0.0254
##
## sigma^2 estimated as 1468702: log likelihood=-6329.37
## AIC=12696.74 AICc=12697.79 BIC=12784.32
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 49.39993 1187.546 774.7905 -1.943759 11.3343 0.6757775
## ACF1
## Training set 0.002123919
coeftest(arima_best)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.3619035 0.0363633 9.9524 < 2.2e-16 ***
## ar2 0.1966896 0.0386960 5.0829 3.716e-07 ***
## ar3 -0.0045111 0.0389705 -0.1158 0.9078454
## ar4 0.1051428 0.0405750 2.5913 0.0095608 **
## ar5 -0.1895744 0.0400635 -4.7318 2.225e-06 ***
## ar6 -0.0455860 0.0411107 -1.1089 0.2674909
## ar7 -0.1095939 0.0404463 -2.7096 0.0067361 **
## ar8 0.0327651 0.0395156 0.8292 0.4070082
## ar9 0.1165062 0.0399790 2.9142 0.0035662 **
## ar10 -0.0296690 0.0394257 -0.7525 0.4517331
## ar11 0.0720647 0.0390268 1.8465 0.0648136 .
## ar12 -0.1793525 0.1344794 -1.3337 0.1823089
## ar13 -0.0787521 0.0590313 -1.3341 0.1821798
## ar14 0.1274817 0.0420117 3.0344 0.0024099 **
## sar1 0.4906213 0.1449486 3.3848 0.0007123 ***
## sar2 0.0078645 0.0920640 0.0854 0.9319243
## sar3 0.1161093 0.0385295 3.0135 0.0025824 **
## sma1 -0.9999946 0.0253807 -39.3998 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Although there are some insignificant intermediate lags, we decided to keep lags up to the 14th, as this improves RMSE and BIC a lot.
Box.test(resid(arima_best), type = "Ljung-Box",lag = 7)
##
## Box-Ljung test
##
## data: resid(arima_best)
## X-squared = 0.57685, df = 7, p-value = 0.9991
With p-value = 0.9991 we fail to reject the null hypothesis of no autocorrelation in the residuals.
plot_acf_pacf(arima_best)
There are no significant lags, indicating no remaining autocorrelation in the residuals.
We need to loop through all stores to estimate the ARIMA coefficients for each store and produce the 6-weeks-ahead forecast.
# All store ids and the test-period dates (taken from store 1, assumed to
# span the full 42-day test window).
store_list <- unique(rossmann1$Store)
test_date <- test1[test1$Store==1,]$Date
# Accumulator for the per-store forecasts.
predict_df <- data.frame()
# Guard flag: the per-store loop below is expensive, so it is skipped by
# default and cached results are read from disk instead.
training <- FALSE
if (training){
  # Fit a SARIMA per store and forecast 6 weeks ahead. Two model shapes:
  # non-Sunday stores use a 6-day week (period 12 = two trading weeks);
  # Sunday stores use the full 7-day week (period 14).
  for (i in seq_along(store_list)){
    print(i)
    store_i <- train1%>%filter(Store==store_list[i])
    # If the store's history is incomplete, only use the last few months.
    if (store_i$missing[1]==1){
      store_i <- store_i%>%filter(Date > "2015-01-01")
    }
    if (store_i$Sunday_sales[1]==0){
      # Non-Sunday-sales model: drop Sundays before fitting.
      store_i <- store_i%>%filter(DayOfWeek !=7)
      # Fixed: the original assigned 'model_i' *inside* the tryCatch error
      # handler; such an assignment is local to the handler function, so on
      # failure 'model_i' stayed undefined (or stale from the previous
      # store). Assign the tryCatch result instead so the fallback fit
      # actually reaches the loop scope.
      model_i <- tryCatch(
        Arima(store_i[,'Sales'],
              order = c(14,0,0),
              seasonal=list(order=c(3,1,1),period=12),
              include.constant = FALSE, method = 'CSS'),
        error = function(cond){
          message('change the optimisation method to avoid some numerial problem with the default method')
          message(cond)
          Arima(store_i[,'Sales'],
                order = c(14,0,0),
                seasonal=list(order=c(3,1,1),period=12),
                include.constant = FALSE,
                method = 'CSS')
        },
        finally = print('passed')
      )
      # Forecast 36 trading days (6 weeks x 6 open days) ...
      predict_i <- forecast(model_i, h =6*6)$mean
      # ... then manually insert 0 for each Sunday so the vector covers
      # 42 calendar days, matching test_date.
      predict_i <- c(predict_i[1],0,
                     predict_i[2:7],0,
                     predict_i[8:13],0,
                     predict_i[14:19],0,
                     predict_i[20:25],0,
                     predict_i[26:31],0,
                     predict_i[32:36])
    }else{ # Sunday sales: 7-day week, 42 calendar-day forecast directly.
      model_i <- tryCatch(
        Arima(store_i[,'Sales'],
              order = c(15,0,0),
              seasonal=list(order=c(3,1,1),period=14),
              include.constant = FALSE,method = 'CSS'),
        error = function(cond){
          message('change the optimisation method to avoid some numerial problem with the default method')
          message(cond)
          Arima(store_i[,'Sales'],
                order = c(15,0,0),
                seasonal=list(order=c(3,1,1),period=14),
                include.constant = FALSE,
                method = 'CSS')
        },
        finally = print('passed')
      )
      predict_i <- forecast(model_i, h =7*6)$mean
    }
    predict_df_i <- as.data.frame(cbind(store_list[i], predict_i))
    colnames(predict_df_i) <- c('Store','Forecast_Sales')
    predict_df_i$Date <- test_date
    predict_df <- rbind(predict_df,predict_df_i)
  }
  # Fixed: join the forecasts against 'test1' (the ARIMA test split built
  # from rossmann1 above) rather than 'test' from the regression section;
  # test1 carries the matching Date/Sales/Store columns for this pipeline.
  ARIMA_preditc_df <- predict_df%>%left_join(test1[,c('Date','Sales', 'Store')],
                                             by =c('Store','Date'))
  saveRDS(ARIMA_preditc_df,'ARIMA_preditc_df.rds')
}
# Load the cached per-store ARIMA forecasts and evaluate them on the test set.
ARIMA_preditc_df <- readRDS('ARIMA_preditc_df.rds')
#suppressWarnings(suppressMessages(library('performanceEstimation')))
regressionMetrics(real =ARIMA_preditc_df$Sales,
predicted =ARIMA_preditc_df$Forecast_Sales)
## MSE RMSE MAE MedAE R2
## 1 4832634 2198.325 1102.916 685.6279 0.6519236
As ARIMA is not good for quite long horizons (RMSE is quite high compared with the other machine learning models), let’s see how the error evolves over time.
# Per-day RMSE across stores — shows how forecast error grows with the
# horizon (column 2 of regressionMetrics() is RMSE).
ARIMA_preditc_df%>%
group_by(Date)%>%
summarise(rmse = regressionMetrics(real =Sales,
predicted = Forecast_Sales)[,2])%>%
ggplot(aes(x=Date, y = rmse))+geom_line()+
ggtitle('rmse over time')
It can be seen that RMSE keeps increasing over time, so it’s not a very good idea — or fair — to use ARIMA to forecast a long period ahead.
# Reload cached forecasts/predictions for the final model comparison.
ARIMA <-readRDS('ARIMA_preditc_df.rds')
# NOTE(review): 'predictions.Rds' was never written because the earlier
# saveRDS call used the misspelled object name 'predicitons' — hence the
# error below and the cascade of errors in the result assembly.
XGBoost_pred <-readRDS('predictions.Rds')
## Error in gzfile(file, "rb"): cannot open the connection
test <- rossmann%>%
group_by(Store)%>%
filter(Date > split_date)
# Test-period actuals; model prediction columns are attached one by one below.
result <- rossmann%>%
filter(Date > split_date)%>%
dplyr::select(Date, Store, Sales)
## Adding missing grouping variables: `Year`
# Attach each model's test-set predictions as a column of 'result'.
result$lm1 <- model1.pred
result$lm_sig <- model1A.pred
result$step_wise<-model1B.pred
result$regTree <- model2.pred
result$regTree_stw <- model2B.pred
# NOTE(review): 'XGBoost_pred' failed to load above, so this column is
# never added — which is why the column-count/index errors follow.
result$XGBoost <- XGBoost_pred
## Error in eval(expr, envir, enclos): object 'XGBoost_pred' not found
result$ridge <- model5.pred
result$lasso <- model6.pred
result$ridge_crt <- model8
result$lasso_crt <- model9
result <- result%>%left_join(ARIMA[,c('Store','Date','Forecast_Sales')])
## Joining, by = c("Date", "Store")
result <- as.data.frame(result)
# Rename the joined forecast column; fails here because the missing
# XGBoost column leaves 'result' with 14 columns instead of 15.
names(result)[15] <- 'ARIMA'
## Error in names(result)[15] <- "ARIMA": 'names' attribute [15] must be the same length as the vector [14]
# Clamp negative predictions to zero across all model columns.
result[,5:15] <- apply(result[,5:15],1:2, function(x) max(x,0))
## Error in `[.data.frame`(result, , 5:15): undefined columns selected
# Simple ensembles: mean of all models, and mean of the best performers.
result <- result%>%
mutate(avg_all = rowMeans(.[,5:15]),
avg_best = rowMeans(.[,c('XGBoost', 'step_wise','lm_sig','lm1','lasso')]))
## Error in mutate_impl(.data, dots): Evaluation error: undefined columns selected.
# Collect test-set metrics for every model column (columns 5:17 of
# 'result') into one long data frame, tagged with the model's name.
result <- as.data.frame(result)
pef_metrics <- data.frame()
model_names <- names(result)[5:17]
for (model_nm in model_names) {
  metrics_i <- regressionMetrics(real = result$Sales,
                                 predicted = result[, model_nm])
  metrics_i$model_name <- model_nm
  pef_metrics <- rbind(pef_metrics, metrics_i)
}
## Error in `[.data.frame`(result, , model_names[i]): undefined columns selected
# Rank models by RMSE (best first) and compare them with horizontal bars,
# once by RMSE and once by R2.
pef_metrics <- pef_metrics%>%arrange(RMSE)
ggplot(pef_metrics, aes(x= reorder(model_name, -RMSE),y=RMSE))+
geom_bar(stat = 'identity')+
xlab('Model name')+
ylab('RMSE')+
ggtitle('Performance of various models on test set - RMSE')+
coord_flip()
ggplot(pef_metrics, aes(x= reorder(model_name, -R2),y=R2))+
geom_bar(stat = 'identity')+
xlab('Model name')+
ylab('R2')+
ggtitle('Performance of various models on test set -R2')+
coord_flip()
*To sum up, XGBoost has the best predictive performance. It has the highest R2 = 92.8 % and the lowest root mean square error, equal to 1004.16. We were a little bit surprised that Regression Trees have the worst results.
*We found, that ARIMA is only good for few period ahead forecast (the result in the test set is worse than it is in the train set.). The longer we forecast the less accurate it is, as the model’s prediction will simply converge to the unconditional mean.